#Loading required packages
library(plyr)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
## 
##     empty
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(e1071)
library(data.table)
library(ellipse)

LOADING THE DATA

loading dataset

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RSQLite)
#load soccer dataset
soccerData <- dbConnect(SQLite(), dbname="/Users/harmanbhullar/Documents/database.sqlite")
#list all tables in dataset
dbListTables(soccerData)
## [1] "Country"           "League"            "Match"            
## [4] "Player"            "Player_Attributes" "Team"             
## [7] "Team_Attributes"   "sqlite_sequence"
#loading all tables of interest
#player table contains data about 11060 players
player <- tbl_df(dbGetQuery(soccerData,"SELECT * FROM player"))
#player_Attributes consists of all the features of players
player_Attributes <- tbl_df(dbGetQuery(soccerData,"SELECT * FROM player_Attributes"))
#team table consists of information about the 299 teams
team <- tbl_df(dbGetQuery(soccerData,"SELECT * FROM Team"))
#team_Attributes consists of all the features of all teams
team_Attributes <- tbl_df(dbGetQuery(soccerData,"SELECT * FROM Team_Attributes"))
# Match table contains the information about the all matches from 2008 to 2016
Match <- tbl_df(dbGetQuery(soccerData,"SELECT * FROM Match"))

Summary: since soccer database has 8 tables.Here, I am interested in only 5(team,team_Attributes,player,player_Attribute and Match) tables. therefore i load only five tables of interest as i am interested to analyse the teams level and players level features and to predict the results i need match table which gives the results of matches w.r.t. teams api id,date of match and match api id. #EXPLORING AND VISUALISING THE DATA ## analyse and visualise the data using box plots visulaising the features of dataset at players levels

#browse stats table
str(player_Attributes)
## Classes 'tbl_df', 'tbl' and 'data.frame':    183978 obs. of  42 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_fifa_api_id : int  218353 218353 218353 218353 218353 189615 189615 189615 189615 189615 ...
##  $ player_api_id      : int  505942 505942 505942 505942 505942 155782 155782 155782 155782 155782 ...
##  $ date               : chr  "2016-02-18 00:00:00" "2015-11-19 00:00:00" "2015-09-21 00:00:00" "2015-03-20 00:00:00" ...
##  $ overall_rating     : int  67 67 62 61 61 74 74 73 73 73 ...
##  $ potential          : int  71 71 66 65 65 76 76 75 75 75 ...
##  $ preferred_foot     : chr  "right" "right" "right" "right" ...
##  $ attacking_work_rate: chr  "medium" "medium" "medium" "medium" ...
##  $ defensive_work_rate: chr  "medium" "medium" "medium" "medium" ...
##  $ crossing           : int  49 49 49 48 48 80 80 79 79 79 ...
##  $ finishing          : int  44 44 44 43 43 53 53 52 51 51 ...
##  $ heading_accuracy   : int  71 71 71 70 70 58 58 57 57 57 ...
##  $ short_passing      : int  61 61 61 60 60 71 71 70 70 70 ...
##  $ volleys            : int  44 44 44 43 43 40 32 29 29 29 ...
##  $ dribbling          : int  51 51 51 50 50 73 73 71 71 71 ...
##  $ curve              : int  45 45 45 44 44 70 70 68 68 68 ...
##  $ free_kick_accuracy : int  39 39 39 38 38 69 69 69 69 69 ...
##  $ long_passing       : int  64 64 64 63 63 68 68 68 68 68 ...
##  $ ball_control       : int  49 49 49 48 48 71 71 70 70 70 ...
##  $ acceleration       : int  60 60 60 60 60 79 79 79 79 79 ...
##  $ sprint_speed       : int  64 64 64 64 64 78 78 78 78 78 ...
##  $ agility            : int  59 59 59 59 59 78 78 78 78 78 ...
##  $ reactions          : int  47 47 47 46 46 67 67 67 67 67 ...
##  $ balance            : int  65 65 65 65 65 90 90 90 90 90 ...
##  $ shot_power         : int  55 55 55 54 54 71 71 71 71 71 ...
##  $ jumping            : int  58 58 58 58 58 85 85 84 84 84 ...
##  $ stamina            : int  54 54 54 54 54 79 79 79 79 79 ...
##  $ strength           : int  76 76 76 76 76 56 56 56 56 56 ...
##  $ long_shots         : int  35 35 35 34 34 62 60 59 58 58 ...
##  $ aggression         : int  71 71 63 62 62 68 68 67 67 67 ...
##  $ interceptions      : int  70 70 41 40 40 67 67 66 66 66 ...
##  $ positioning        : int  45 45 45 44 44 60 60 58 58 58 ...
##  $ vision             : int  54 54 54 53 53 66 66 65 65 65 ...
##  $ penalties          : int  48 48 48 47 47 59 59 59 59 59 ...
##  $ marking            : int  65 65 65 62 62 76 76 76 76 76 ...
##  $ standing_tackle    : int  69 69 66 63 63 75 75 75 75 75 ...
##  $ sliding_tackle     : int  69 69 69 66 66 78 78 78 78 78 ...
##  $ gk_diving          : int  6 6 6 5 5 14 14 14 14 14 ...
##  $ gk_handling        : int  11 11 11 10 10 7 7 7 7 7 ...
##  $ gk_kicking         : int  10 10 10 9 9 9 9 9 9 9 ...
##  $ gk_positioning     : int  8 8 8 7 7 9 9 9 9 9 ...
##  $ gk_reflexes        : int  8 8 8 7 7 12 12 12 12 12 ...
#extract only the columns of interest which are the features of players and put in table plr_features.
plr_features <- (player_Attributes[c(3,4,5, 6, 10:42)])
#plot boxplot to visualise the features 
boxplot(plr_features[,3:37], las = 2,main="boxplot for all features of players")

Summary: In above boxplot, there are so many variables who has alot of outliers in both direction minimum and maximum. which means that this dataset has so many variation which could be possible as this data is quite big and has featues for almost 11k players. It can be clearly seen that finishing, volleys, curve, gk_accuracy, long-shots, interception,marking, standing_tackle,sliding_tacle has no outlier at all which means all values are near to mean value. These features could be considered as good to anlayse the features and performance of players. howeever, there are some features like gk_diving,gk_handling,gk_positioning and gk_reflexes has least variability but has a lot of outlier above the mean value which are scatter above the whole axis. On the other side, marking,standing_tackle, sliding_tacle has more variablity with no outlier at all.Crossing can be also good feature to be considered as has very small number of outliers below the mean. Most of features has outliers below the mean value.

visualise the feartures of players:

In above case there was so many outliers, as there was multple values for all features of each players, so by taking the mean of all features of each player.it would gives good results.

plr_feature <- (player_Attributes[c(3,5, 6, 10:42)])
library(data.table)
#coverting the dataframe "plr_feature"" into data.table for further calculations
setDT(plr_feature)
# taking the mean(average) of all the features of players w.r.t player_api_id and storing the results into mean_plr_features dataframe
mean_plr_features <-plr_feature[ ,lapply(.SD, mean),  by = .(player_api_id)]
#boxplot to visualise the changes from previous visulaisation.
boxplot(mean_plr_features[,2:36], las = 2,main="boxplot for all features of players after taking mean of all features w.r.t player api id")

Summary: Here this boxplot for all features is better than previsous as we take mean of all features w.r.t the p;ayer api id and in this boxplot there is less number of outliers and more clear visualisation than the previous one. It can be clearly seen that crossing, finishing, volleys, curve, gk_accuracy, long-shots, interception,marking, standing_tackle,sliding_tacle has no outlier at all which means all values are near to mean value. These features could be considered as good to anlayse the features and performance of players. howeever, there are some features like gk_diving,gk_handling,gk_positioning and gk_reflexes has least variability but has a lot of outlier above the mean value which are scatter above the whole axis. On the other side, marking,standing_tackle, sliding_tacle has more variablity with no outlier at all.Crossing can be also good feature to be considered as has very small number of outliers below the mean. Most of features has outliers below the mean value. Hence, I have shortlisted important features overall rating, potentail, crossing, finishing, heading_accurancy, short_passing, volleys, dribbling, curve, free_kicking_accuracy, sprint_speed, agility, stamina, long_shorts, penalities, marking, standing_tackle, sliding_tackle, gk_diving, gk_handling for further visualisations. ## Analyse the features and performance of players through match data.

#checking what is available inside the match table for players and to observe how to connect match table and players
str(Match)
## Classes 'tbl_df', 'tbl' and 'data.frame':    25979 obs. of  115 variables:
##  $ id              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ country_id      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ league_id       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ season          : chr  "2008/2009" "2008/2009" "2008/2009" "2008/2009" ...
##  $ stage           : int  1 1 1 1 1 1 1 1 1 10 ...
##  $ date            : chr  "2008-08-17 00:00:00" "2008-08-16 00:00:00" "2008-08-16 00:00:00" "2008-08-17 00:00:00" ...
##  $ match_api_id    : int  492473 492474 492475 492476 492477 492478 492479 492480 492481 492564 ...
##  $ home_team_api_id: int  9987 10000 9984 9991 7947 8203 9999 4049 10001 8342 ...
##  $ away_team_api_id: int  9993 9994 8635 9998 9985 8342 8571 9996 9986 8571 ...
##  $ home_team_goal  : int  1 0 0 5 1 1 2 1 1 4 ...
##  $ away_team_goal  : int  1 0 3 0 3 1 2 2 0 1 ...
##  $ home_player_X1  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X2  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X3  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X4  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X5  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X6  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X7  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X8  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X9  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X10 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_X11 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X1  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X2  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X3  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X4  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X5  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X6  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X7  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X8  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X9  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X10 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_X11 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y1  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y2  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y3  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y4  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y5  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y6  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y7  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y8  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y9  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y10 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_Y11 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y1  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y2  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y3  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y4  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y5  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y6  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y7  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y8  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y9  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y10 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_Y11 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_1   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_2   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_3   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_4   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_5   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_6   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_7   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_8   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_9   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_10  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ home_player_11  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_1   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_2   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_3   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_4   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_5   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_6   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_7   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_8   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_9   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_10  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ away_player_11  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ goal            : chr  NA NA NA NA ...
##  $ shoton          : chr  NA NA NA NA ...
##  $ shotoff         : chr  NA NA NA NA ...
##  $ foulcommit      : chr  NA NA NA NA ...
##  $ card            : chr  NA NA NA NA ...
##  $ cross           : chr  NA NA NA NA ...
##  $ corner          : chr  NA NA NA NA ...
##  $ possession      : chr  NA NA NA NA ...
##  $ B365H           : num  1.73 1.95 2.38 1.44 5 4.75 2.1 3.2 2.25 1.3 ...
##  $ B365D           : num  3.4 3.2 3.3 3.75 3.5 3.4 3.2 3.4 3.25 5.25 ...
##  $ B365A           : num  5 3.6 2.75 7.5 1.65 1.67 3.3 2.2 2.88 9.5 ...
##  $ BWH             : num  1.75 1.8 2.4 1.4 5 4.85 2.05 2.55 2.3 1.25 ...
##  $ BWD             : num  3.35 3.3 3.3 4 3.5 3.4 3.25 3.3 3.25 5 ...
##  $ BWA             : num  4.2 3.95 2.55 6.8 1.6 1.65 3.15 2.4 2.7 10 ...
##  $ IWH             : num  1.85 1.9 2.6 1.4 4 3.7 1.85 2.4 2.1 1.3 ...
##  $ IWD             : num  3.2 3.2 3.1 3.9 3.3 3.2 3.2 3.2 3.1 4.2 ...
##  $ IWA             : num  3.5 3.5 2.3 6 1.7 1.8 3.5 2.4 3 8 ...
##  $ LBH             : num  1.8 1.9 2.5 1.44 4 5 1.83 2.5 2.25 1.25 ...
##  $ LBD             : num  3.3 3.2 3.2 3.6 3.4 3.25 3.3 3.2 3.2 4.5 ...
##  $ LBA             : num  3.75 3.5 2.5 6.5 1.72 1.62 3.6 2.5 2.75 10 ...
##  $ PSH             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PSD             : num  NA NA NA NA NA NA NA NA NA NA ...
##   [list output truncated]

Summary: Unfortunately, in match table there is no information is available for players in the the match table. Here all the columns for home and away players is empty. so, there is no scope to connect the player table with match to analyse the charcteristics of player through the performance in matches.

visualise players features by tracking the performance over the time.

As there is no way to connect the players attributes and match table so here i am going to analyse the performance of players by visualising thefeatures with time(in years)

#loading required packages
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:plyr':
## 
##     here
## The following object is masked from 'package:base':
## 
##     date
#convert the date column into year by year() function as i am interested in only years  
plr_features$year <- year(plr_features$date)
#subsetting datsset as per requirement of columns(taking only shortlisted features of players)
subset_plr <- plr_features[c(3:12,16,17,22:24,29:34,38)]
#melting the dataset to reshape it
melted <- melt(subset_plr, id=c("year"))
# taking mean for values as there are so many values for each player
melted<- aggregate( formula = value~year+variable, data = melted, FUN = mean )
# plotting all the features of players on the same plot
ggplot(melted, aes(year,value, col=variable)) + 
  geom_point() + 
  stat_smooth() + ggtitle("feature visualisation for all player over the time")
## `geom_smooth()` using method = 'loess'

Summary: Here, as in above figure of visualisation for all player over the time, interesting fact is there was major drop can be seen in the penalities over time which means that players improve their mistakes over the time as penalties decreases over the time.Moreover, gk_diving also decreses over the time period. on the other side, short_passing, dribbling, free_kick_accuracy and strengh improves over the time as there was increses can be observed in the plot over the time. however, rest of features reamins almost stable over the time period. ## visualse top 10 players performance over time now, here i am only interested in top players performance over the time period. I am going to visualise the performance according to the characteristics of players over the time for top 5 players.

#subetting the features of players by taking only shortlisted features
subset_plrs <- plr_features[c(1,3:12,16,17,22:24,29:34,38)]
#melting the dataset to reshape 
melt_plr <- melt(subset_plrs, id=c("year","player_api_id"))
melt_plr<-aggregate( formula = value~year+variable+player_api_id, data = melt_plr, FUN = mean )
library(dplyr)
#merging plr_feature table and player by player_api_id to get the names of players
name_plr <- merge(plr_features,player,by="player_api_id")
# get the latest data about players 
latest<-name_plr %>% group_by(player_api_id) %>% filter(date == max(date))
# extract the top 10 players from latest overall rating
top10_plrs <- latest %>% arrange(desc(overall_rating)) %>% 
  head(n = 10)
# extracting the player name and rank with player api id  for top 10 players and storing results into top10_plr
top10_plr <- top10_plrs[c(1,40)]
head(top10_plr)
## # A tibble: 6 x 2
## # Groups:   player_api_id [6]
##   player_api_id       player_name
##           <int>             <chr>
## 1         30981      Lionel Messi
## 2         30893 Cristiano Ronaldo
## 3         19533            Neymar
## 4         27299      Manuel Neuer
## 5         40636       Luis Suarez
## 6         30834      Arjen Robben
#merging two tables by player_api_id
top10 <-merge(top10_plr,melt_plr
         , by= c("player_api_id"))
#sutset the data for top number 1 player i.e Lionel Messi 
plr_1 <- subset(top10,player_name=="Lionel Messi")
#sutset the data for top number 2 player i.e Cristiano Ronaldo
plr_2 <-subset(top10,player_name=="Cristiano Ronaldo")
#sutset the data for top number 3 player i.e Neymar
plr_3 <-subset(top10,player_name=="Neymar")
#sutset the data for top number 4 player i.e Manuel Neuer
plr_4 <-subset(top10,player_name=="Manuel Neuer")
#sutset the data for top number 5 player i.e Luis Suarez
plr_5 <-subset(top10,player_name=="Luis Suarez")
# plotting  all features of each players on the same plot 
ggplot(plr_1, aes(year,value, col=variable) )+ 
  geom_point() + 
  stat_smooth() + ggtitle("feature visualisation for Lionel Messi number#1 player" )
## `geom_smooth()` using method = 'loess'

ggplot(plr_2, aes(year,value, col=variable)) + 
  geom_point() + 
  stat_smooth()+ ggtitle("feature visualisation for Cristiano Ronaldo number#2 player") 
## `geom_smooth()` using method = 'loess'

ggplot(plr_3, aes(year,value, col=variable)) + 
  geom_point() + 
  stat_smooth()+ ggtitle("feature visualisation for Neymar number#3 player") 
## `geom_smooth()` using method = 'loess'

ggplot(plr_4, aes(year,value, col=variable)) + 
  geom_point() + 
  stat_smooth()+ ggtitle("feature visualisation for Manuel Neuer number#4 player") 
## `geom_smooth()` using method = 'loess'

ggplot(plr_5, aes(year,value, col=variable)) + 
  geom_point() + 
  stat_smooth()+ ggtitle("feature visualisation for Luis Suarez number#5 player") 
## `geom_smooth()` using method = 'loess'

Summary: Lionel Messi is number#1 player according to latest stats of players. Messi improves the mistakes over the time as there in the plot drop in the penalities can be seen. Also, he improves heading accuracy, overall rating, stamina, free kicking accuracy and sprint speed over the time as these features are increasing over the time period. But there was fluctuations can be seen in strengh overall improving. on the other side, marking decreses till 2012 and thereafter, it increases till end. Number #2 player, Cristiano Ronaldo’s overall performance remains same over the time. but he get better with long shots over the time and dribbling and gk handling became weaker with time as drop can be seen with the time. however, penalities increses with the time. Number #3 player Neymar improves his stamina, kick accuracy and long shots with the time as seen increasing over the time but gk handling lacks with time and rest of features remains stable. Manuel Neuer number #4 most interesting fact can be seen that Neuer improves his mistakes ovet the time as major drop in penalities can be seen in the plot. his agility, stamina gets weak with the time as plot gives decresing trend. But he work on strenght, shot passing and handling accuracy and gets better with time as increasing trend can be noticed in graph. Luis Suarez number #5 improves standing tackle, sliding tackle, stamina and shot passing incresing trend is seen and here almost all features either getting better or remains same but nothing get worst with the time.

visulaisung the features at teams levels

Now i am moving to teams level visualisation. First of all i am going to check the variation of different features with the help of box plot.

#browse stats table to check what is inside the table
str(team_Attributes)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1458 obs. of  25 variables:
##  $ id                            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ team_fifa_api_id              : int  434 434 434 77 77 77 77 77 77 614 ...
##  $ team_api_id                   : int  9930 9930 9930 8485 8485 8485 8485 8485 8485 8576 ...
##  $ date                          : chr  "2010-02-22 00:00:00" "2014-09-19 00:00:00" "2015-09-10 00:00:00" "2010-02-22 00:00:00" ...
##  $ buildUpPlaySpeed              : int  60 52 47 70 47 58 62 58 59 60 ...
##  $ buildUpPlaySpeedClass         : chr  "Balanced" "Balanced" "Balanced" "Fast" ...
##  $ buildUpPlayDribbling          : int  NA 48 41 NA NA NA NA 64 64 NA ...
##  $ buildUpPlayDribblingClass     : chr  "Little" "Normal" "Normal" "Little" ...
##  $ buildUpPlayPassing            : int  50 56 54 70 52 62 45 62 53 40 ...
##  $ buildUpPlayPassingClass       : chr  "Mixed" "Mixed" "Mixed" "Long" ...
##  $ buildUpPlayPositioningClass   : chr  "Organised" "Organised" "Organised" "Organised" ...
##  $ chanceCreationPassing         : int  60 54 54 70 53 45 40 56 51 45 ...
##  $ chanceCreationPassingClass    : chr  "Normal" "Normal" "Normal" "Risky" ...
##  $ chanceCreationCrossing        : int  65 63 63 70 48 70 50 68 72 35 ...
##  $ chanceCreationCrossingClass   : chr  "Normal" "Normal" "Normal" "Lots" ...
##  $ chanceCreationShooting        : int  55 64 64 70 52 55 55 57 63 55 ...
##  $ chanceCreationShootingClass   : chr  "Normal" "Normal" "Normal" "Lots" ...
##  $ chanceCreationPositioningClass: chr  "Organised" "Organised" "Organised" "Organised" ...
##  $ defencePressure               : int  50 47 47 60 47 40 42 41 49 30 ...
##  $ defencePressureClass          : chr  "Medium" "Medium" "Medium" "Medium" ...
##  $ defenceAggression             : int  55 44 44 70 47 40 42 42 45 70 ...
##  $ defenceAggressionClass        : chr  "Press" "Press" "Press" "Double" ...
##  $ defenceTeamWidth              : int  45 54 54 70 52 60 60 60 63 30 ...
##  $ defenceTeamWidthClass         : chr  "Normal" "Normal" "Normal" "Wide" ...
##  $ defenceDefenderLineClass      : chr  "Cover" "Cover" "Cover" "Cover" ...
#subsetting the dataset by selecting required columns of table and storing into team_features table.
team_features <- (team_Attributes[c(3,4,5,7,9,12,14,16,19,21,23)])
#convert the dataframe into the data.table
setDT(team_features)
#boxplot for team features
boxplot(team_features[, 3:10], las = 2, main="Boxplot of all features of teams")

Summary: Here all features of all teams are stable. if i compare the all features with each other, then most variant features are play speed, play passing and crossing. also, there are some outlier exists in all cases except play speed. moreover i am goint to exclude dribbling as there are so many NA/null values are in dribbling. ## extract data from match table for home teams win loss or draw First of all i am going to extract the results for home from match table as it has values for number of goals for home and away country.

#subsetting the table by selecting required rows and storing into datafram a
a <- Match[,6:11]
# stats tables to check what (contents of table) is available inside table Match 
str(a)
## Classes 'tbl_df', 'tbl' and 'data.frame':    25979 obs. of  6 variables:
##  $ date            : chr  "2008-08-17 00:00:00" "2008-08-16 00:00:00" "2008-08-16 00:00:00" "2008-08-17 00:00:00" ...
##  $ match_api_id    : int  492473 492474 492475 492476 492477 492478 492479 492480 492481 492564 ...
##  $ home_team_api_id: int  9987 10000 9984 9991 7947 8203 9999 4049 10001 8342 ...
##  $ away_team_api_id: int  9993 9994 8635 9998 9985 8342 8571 9996 9986 8571 ...
##  $ home_team_goal  : int  1 0 0 5 1 1 2 1 1 4 ...
##  $ away_team_goal  : int  1 0 3 0 3 1 2 2 0 1 ...
#adding new column results which gives the match results of home team by comparing home_team_goals with away_team goals like if if home team score more goals than away then it win and if less then loss and if equal the draw 
a$results <- a$home_team_goal
a$results[a$home_team_goal>a$away_team_goal] <- "Win"
a$results[a$home_team_goal<a$away_team_goal] <- "Loss"
a$results[a$home_team_goal==a$away_team_goal] <- "Draw"
#subsetting the table by selecting only required columns and storing in team_A for home team
team_A<-a[c(1,2,3,7)]
library(dplyr)
#rename the column name home_team_api_id to team_api_id
team_A<-team_A %>% dplyr::rename(team_api_id = home_team_api_id)
str(team_A)
## Classes 'tbl_df', 'tbl' and 'data.frame':    25979 obs. of  4 variables:
##  $ date        : chr  "2008-08-17 00:00:00" "2008-08-16 00:00:00" "2008-08-16 00:00:00" "2008-08-17 00:00:00" ...
##  $ match_api_id: int  492473 492474 492475 492476 492477 492478 492479 492480 492481 492564 ...
##  $ team_api_id : int  9987 10000 9984 9991 7947 8203 9999 4049 10001 8342 ...
##  $ results     : chr  "Draw" "Draw" "Loss" "Win" ...

Summary: team A table is derived from Match table which has information about the date of match result of match for home team with nnnnnnnn..l.;pnapi id for home team and match api id. ##extract data from match table for away teams win loss or draw Here, i am going to extract the results for home from match table as it has values for number of goals for away and away country.

#subsetting the table by selecting required rows and storing into datafram b
b <- Match[,6:11]
#adding new column results which gives the match results of away team by comparing away_team_goals with home_team goals like if away team score more goals than home team then it win and if less then loss and if equal the draw 
b$results <- b$away_team_goal
b$results[b$home_team_goal<b$away_team_goal] <- "Win"
b$results[b$home_team_goal>b$away_team_goal] <- "Loss"
b$results[b$home_team_goal==b$away_team_goal] <- "Draw"
#subsetting the table by selecting only required columns and storing in team_B for away team
team_B<-b[c(1,2,4,7)]
#rename the column name away_team_api_id to team_api_id
team_B<-team_B %>% dplyr::rename(team_api_id = away_team_api_id)
#team_B<-rename(team_B, "team_api_id"="away_team_api_id")
str(team_A)
## Classes 'tbl_df', 'tbl' and 'data.frame':    25979 obs. of  4 variables:
##  $ date        : chr  "2008-08-17 00:00:00" "2008-08-16 00:00:00" "2008-08-16 00:00:00" "2008-08-17 00:00:00" ...
##  $ match_api_id: int  492473 492474 492475 492476 492477 492478 492479 492480 492481 492564 ...
##  $ team_api_id : int  9987 10000 9984 9991 7947 8203 9999 4049 10001 8342 ...
##  $ results     : chr  "Draw" "Draw" "Loss" "Win" ...

Summary: team B table is derived from Match table which has information about the date of match ,result of match for away team with api id for away team and match api id. ## join tables to get results for all teams for win loss and draw Now after extracting the results for home and away team i am going to joint the tables into single table for all teams results for further visualisations.

#merging the results of home team and away team into single table for further calculations by match api id
team_results <- merge(team_A,team_B,by="match_api_id")
head(team_results)
##   match_api_id              date.x team_api_id.x results.x
## 1       483129 2008-08-09 00:00:00          8583       Win
## 2       483130 2008-08-09 00:00:00          9827       Win
## 3       483131 2008-08-09 00:00:00          9746       Win
## 4       483132 2008-08-09 00:00:00          8682      Loss
## 5       483133 2008-08-10 00:00:00          9748       Win
## 6       483134 2008-08-09 00:00:00          9829       Win
##                date.y team_api_id.y results.y
## 1 2008-08-09 00:00:00          9830      Loss
## 2 2008-08-09 00:00:00          7819      Loss
## 3 2008-08-09 00:00:00          9831      Loss
## 4 2008-08-09 00:00:00          8689       Win
## 5 2008-08-10 00:00:00          9941      Loss
## 6 2008-08-09 00:00:00          9847      Loss
#binding the two tables of home teams results and away teams results into one table
z <- rbind(team_A,team_B)
str(z)
## Classes 'tbl_df', 'tbl' and 'data.frame':    51958 obs. of  4 variables:
##  $ date        : chr  "2008-08-17 00:00:00" "2008-08-16 00:00:00" "2008-08-16 00:00:00" "2008-08-17 00:00:00" ...
##  $ match_api_id: int  492473 492474 492475 492476 492477 492478 492479 492480 492481 492564 ...
##  $ team_api_id : int  9987 10000 9984 9991 7947 8203 9999 4049 10001 8342 ...
##  $ results     : chr  "Draw" "Draw" "Loss" "Win" ...
library(dplyr)
#getting the number of wins loss and draws of each team
z1 <- with(z,table(team_api_id, results))
z1<-as.data.frame.matrix(z1) 
z1 <- add_rownames(z1, "team_api_id")
## Warning: Deprecated, use tibble::rownames_to_column() instead.
head(z1)
## # A tibble: 6 x 4
##   team_api_id  Draw  Loss   Win
##         <chr> <int> <int> <int>
## 1        1601    57    91    92
## 2        1773    28    40    22
## 3        1957    64    89    87
## 4        2033    55    63    32
## 5        2182    60    56   124
## 6        2183    38    52    60

First table gives the results for all teams in single table after connecting both tables of results of home and away. last table give the information about the results of the number of wins,loss and draws for each teams. ## visualise all the team A(home team)featues with respect to win draw and loss classes.

#join table team_results of team_A(home team) and team features by team_api_id and date
A_vis<-merge(team_A,team_features
         , by=c("team_api_id", "date"))
#convert the results column from char type to factors
A_vis$results=as.factor(A_vis$results)
# load the libraries
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following object is masked from 'package:modeltools':
## 
##     prior
library(ddalpha)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: class
## Loading required package: robustbase
library(caret)
## Loading required package: lattice
# Box plots for each attribute by class value
featurePlot(x=A_vis[, 5:13], y=A_vis$results, plot="box", scales=list(x=list(relation="free"), y=list(relation="free")),adjust=0.5,pch = "|",auto.key = list(columns = 2), main="Boxplot for home teams features")

Summary: plot gives the infoirmation about the featues of home team for three classes win, loss and draw. mean of shooting for all of thre classes is almost same which means this features does not contribute towards the match result of home team. But when the defence pressure against home teams is low that time home team wins but when its high against home team that time away team wins and when its very low that team its draw. The mean of Defecnce aggression and defence team width against home team is almost same for when home team wins and loss. when play speed varies and went highest in cases when home team wins. Hence, defence pressure is very good feature for results of home teams as there was low pressure on home team agianst away when home team win. ## visualise all the team B(away team) featues with respect to win draw and loss classes.

#join table team_results of team_B(away team) and team features by team_api_id and date
B_vis<-merge(team_B,team_features
         , by= c("team_api_id", "date"))
#convert the results column from char type to factors
B_vis$results=as.factor(B_vis$results)
# load the libraries
library(kernlab)
library(ddalpha)
library(caret)

# box plots for each attribute by class value
featurePlot(x=B_vis[, 5:13], y=B_vis$results, plot="box", scales=list(x=list(relation="free"), y=list(relation="free")),adjust=0.5,pch = "|",auto.key = list(columns = 2),main="Boxplot for away teams features")

Summary: this plot gives the infoirmation about the featues of away team for three classes win, loss and draw. if we compare this plot with previous home vs away its clear that defence pressure and aggresion of home team is always is very high aginst away team. now, when defence pressure of home team against away team is high that time away team losses the match when its low that time away team wins the match but opposite trend is obeserved in case of defence agresssion like when home teams agression is high that time more likey to chance of away team wins and when its low then home team wins the match. asso, when home teams defence width id high that time its draw but with high value of home teams defence width it wins and with low it losses and vice versa. Away team wins when creation passing is high and loss when its lower ## visualise features for all teams wrt W/D/L box plot

#join table team_results of all teams(home and away teams) and team features by team_api_id and date
vis_features<-merge(z,team_features
         , by=c("team_api_id", "date"))
#convert the results column from char type to factors
vis_features$results=as.factor(vis_features$results)
# feature plot for each attribute by class value
featurePlot(x=vis_features[, 5:13], y=vis_features$results, plot="box", scales=list(x=list(relation="free"), y=list(relation="free")),pch = "|", main="Boxplot for all teams features")

This plot is giving information about all the teams(home and away) for all features against home win and loss class. overall, there is more chances to be winner of match if team has less defence pressure and if its high there is more chance to loss a match. But opposite trend and interesting fact can be seen when it comes to defence aggresion like when defence teamds shows more aggresion then team wins agains aggression but losses when its low. ## visualise features wrt W/D/L density plot

# density plot w.r.t win, loss and draw class
featurePlot(x=vis_features[c(5,7:13)], y=vis_features$results, plot="density", scales=list(x=list(relation="free"), y=list(relation="free")),pch = "|",auto.key = list(space = "top"), main="density plot for home teams features")

here from density plot, there is no strong relationship for win class and all features of teams but team will always loss when there is high defence aggressoion and pressure and play passing. and its draw when there is high creation shooting and creation passing. ## finding co-relation of all features of teams with loss and win class

#subsetting the dataset by deleting unwanted columns
team_feature <- (team_features[,c(-2,-4)])
#taking the mean of all features w.r.t team_api_id
mean_team <- team_feature [ ,lapply(.SD, mean),  by = .(team_api_id)]
#merging z table which contains results of all teams and mean_team table which contains mean of all features by team_api_id
cor_teams<-merge(z,mean_team
         , by=c("team_api_id"))
#convert the results column from char type to factors
cor_teams$results=as.factor(cor_teams$results)
# finding correlation using ggcorr
library(corrplot)
#substting the dataset by selecting the data only for win class
win<-subset(cor_teams, results=="Win")
win <-cor(win[,5:12])
title <- "Correlation in features W.R.T. win"
#plotting correction plot for all features w.r.t win class
corrplot(win, method = "shade", order="hclust", addrect=2, title = title,mar=c(0,0,1,0) )

#substting the dataset by selecting the data only for loss class
loss <-subset(cor_teams, results=="Loss")
loss <-cor(loss[,5:12])
title <- "Correlation in features W.R.T. loss"
#plotting correction plot for all features w.r.t loss class
corrplot(loss, method = "shade", order="hclust", addrect=2, title = title,mar=c(0,0,1,0) )

here Correlation in features W.R.T. win shows that play speed has positive co-realtion withplay passing, creation passing and defence agression like when they highly co related with positive realtion then there is high chance to win. moreover, crossing has also positive co relation for win class with creation passing. there is negative co relation between defence pressure and play passing in case of win. when observing against loss class,play speed has positive relation with play passing and creation passing. but play passing has negaive co reation with creation shooting and defenec pressure but defence pressure and defence aggresion has positive co relation for loss class. which means when efence pressure and defence aggresion are positively co related then there is more chance that team will loss the match. ## PREDICTION MODEL

preparing dataset for prediction model

# a is a dataset which contains results for home team so here i am adding new column "team_type" and giving value "home"
a$team_type<-"home"
#subsetting the dataset by selecting the required columns and storing them in c1 dataframe
c1<-a[,c(3,4,7,8)]
#convert the results column from char type to factor
c1$results=as.factor(c1$results)
#convert the team_type column from char type to factor
c1$team_type=as.factor(c1$team_type)
#adding new column for speed of home teams and extracting the speed values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$speed_home_team<-mean_team$buildUpPlaySpeed[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column for speed of away teams and extracting the speed values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$speed_away_team<-mean_team$buildUpPlaySpeed[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for speed difference for home teams from away teams
c1$speed_diff <- (c1$speed_home_team - c1$speed_away_team)
#adding new column for playPassing of home teams and extracting the playPassing values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$playPassing_home_team<-mean_team$buildUpPlayPassing[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column for playPassing of away teams and extracting the playPassing values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$playPassing_away_team<-mean_team$buildUpPlayPassing[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for playPassing difference for home teams from away teams
c1$playPassing_diff <- (c1$playPassing_home_team - c1$playPassing_away_team)
#adding new column for creationPassing of home teams and extracting the creationPassing values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$creationPassing_home_team<-mean_team$chanceCreationPassing[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column for creationPassing of away teams and extracting the creationPassing values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$creationPassing_away_team<-mean_team$chanceCreationPassing[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for creationPassing difference for home teams from away teams
c1$creationPassing_diff <- (c1$creationPassing_home_team - c1$creationPassing_away_team)
#adding new column for Crossing of home teams and extracting the Crossing values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$Crossing_home_team<-mean_team$chanceCreationCrossing[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column for Crossing of away teams and extracting the Crossing values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$Crossing_away_team<-mean_team$chanceCreationCrossing[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for Crossing difference for home teams from away teams
c1$crossing_diff <- (c1$Crossing_home_team - c1$Crossing_away_team)
#adding new column for shooting of home teams and extracting the shooting values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$shooting_home_team<-mean_team$chanceCreationShooting[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column for shooting of away teams and extracting the shooting values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$shooting_away_team<-mean_team$chanceCreationShooting[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for shooting difference for home teams from away teams
c1$shooting_diff <- (c1$shooting_home_team - c1$shooting_away_team)
#adding new column for defencePressure of home teams and extracting the defencePressure values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$defencePressure_home_team<-mean_team$defencePressure[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column for defencePressure of away teams and extracting the defencePressure values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$defencePressure_away_team<-mean_team$defencePressure[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for defencePressure difference for home teams from away teams
c1$defencePressure_diff<-(c1$defencePressure_home_team-c1$defencePressure_away_team)
#adding new column for defenceAggression of home teams and extracting the defenceAggressione values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$defenceAggression_home_team<-mean_team$defenceAggression[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column for defenceAggression of away teams and extracting the defenceAggression values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$defenceAggression_away_team<-mean_team$defenceAggression[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for defenceAggression difference for home teams from away teams
c1$defenceAggression_diff<-(c1$defenceAggression_home_team-c1$defenceAggression_away_team)
#adding new column for defenceTeamWidth of home teams and extracting the defenceTeamWidth values by matching home_team_api_id from c1 table with team_id from mean_team table
c1$defenceTeamWidth_home_team<-mean_team$defenceTeamWidth[match(c1$home_team_api_id, mean_team$team_api_id)]
#adding new column fordefenceTeamWidth of away teams and extracting the defenceTeamWidth values by matching away_team_api_id from c1 table with team_id from mean_team table
c1$defenceTeamWidth_away_team<-mean_team$defenceTeamWidth[match(c1$away_team_api_id, mean_team$team_api_id)]
#adding a new column for defenceTeamWidth difference for home teams from away teams
c1$defenceTeamWidth_diff<-(c1$defenceTeamWidth_home_team-c1$defenceTeamWidth_away_team)
#data cleaning i.e. removing the rows having NA/null values from c1 dataframe 
c1<-na.omit(c1)
# b is a dataset which contains results for away team so here i am adding new column "team_type" and giving value "away"
b$team_type<-"away"
#subsetting the dataset by selecting the required columns and storing them in c2 dataframe
c2<-b[,c(3,4,7,8)]
#convert the results column from char type to factor
c2$results=as.factor(c2$results)
#convert the team_type column from char type to factor
c2$team_type=as.factor(c2$team_type)
#adding new column for speed of away teams and extracting the speed values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$speed_home_team<-mean_team$buildUpPlaySpeed[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column for speed of away teams and extracting the speed values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$speed_away_team<-mean_team$buildUpPlaySpeed[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for speed difference for away teams from home teams
c2$speed_diff <- (c2$speed_away_team - c2$speed_home_team)
#adding new column for playPassing of home teams and extracting the playPassing values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$playPassing_home_team<-mean_team$buildUpPlayPassing[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column for playPassing of away teams and extracting the playPassing values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$playPassing_away_team<-mean_team$buildUpPlayPassing[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for playPassing difference for away teams from home teams
c2$playPassing_diff <- (c2$playPassing_away_team - c2$playPassing_home_team)
#adding new column for creationPassing of home teams and extracting the creationPassing values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$creationPassing_home_team<-mean_team$chanceCreationPassing[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column for creationPassing of away teams and extracting the creationPassing values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$creationPassing_away_team<-mean_team$chanceCreationPassing[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for creationPassing difference for away teams from home teams
c2$creationPassing_diff <- (c2$creationPassing_away_team - c2$creationPassing_home_team)
#adding new column for Crossing of home teams and extracting the Crossing values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$Crossing_home_team<-mean_team$chanceCreationCrossing[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column for Crossing of away teams and extracting the Crossing values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$Crossing_away_team<-mean_team$chanceCreationCrossing[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for Crossing difference for away teams from home teams
c2$crossing_diff <- (c2$Crossing_away_team - c2$Crossing_home_team)
#adding new column for shooting of home teams and extracting the shooting values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$shooting_home_team<-mean_team$chanceCreationShooting[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column for shooting of away teams and extracting the shooting values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$shooting_away_team<-mean_team$chanceCreationShooting[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for shooting difference for away teams from home teams
c2$shooting_diff <- (c2$shooting_away_team - c2$shooting_home_team)
#adding new column for defencePressure of home teams and extracting the defencePressure values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$defencePressure_home_team<-mean_team$defencePressure[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column for defencePressure of away teams and extracting the defencePressure values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$defencePressure_away_team<-mean_team$defencePressure[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for defencePressure difference for away teams from home teams
c2$defencePressure_diff<-(c2$defencePressure_away_team-c2$defencePressure_home_team)
#adding new column for defenceAggression of home teams and extracting the defenceAggressione values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$defenceAggression_home_team<-mean_team$defenceAggression[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column for defenceAggression of away teams and extracting the defenceAggression values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$defenceAggression_away_team<-mean_team$defenceAggression[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for defenceAggression difference for away teams from home teams
c2$defenceAggression_diff<-(c2$defenceAggression_away_team-c2$defenceAggression_home_team)
#adding new column for defenceTeamWidth of home teams and extracting the defenceTeamWidth values by matching home_team_api_id from c2 table with team_id from mean_team table
c2$defenceTeamWidth_home_team<-mean_team$defenceTeamWidth[match(c2$home_team_api_id, mean_team$team_api_id)]
#adding new column fordefenceTeamWidth of away teams and extracting the defenceTeamWidth values by matching away_team_api_id from c2 table with team_id from mean_team table
c2$defenceTeamWidth_away_team<-mean_team$defenceTeamWidth[match(c2$away_team_api_id, mean_team$team_api_id)]
#adding a new column for defenceTeamWidth difference for away teams from home teams
c2$defenceTeamWidth_diff<-(c2$defenceTeamWidth_away_team-c2$defenceTeamWidth_home_team)
#data cleaning i.e. removing the rows having NA/null values from c2 dataframe 
c2<-na.omit(c2)
#binding dataframe c1 and c2 c1
c<-rbind(c1,c2)
c<-c[c(3,4,7,10,13,16,19,22,25,28)]
str(c)
## Classes 'tbl_df', 'tbl' and 'data.frame':    51258 obs. of  10 variables:
##  $ results               : Factor w/ 3 levels "Draw","Loss",..: 1 1 2 3 1 1 3 3 2 2 ...
##  $ team_type             : Factor w/ 2 levels "home","away": 1 1 1 1 1 1 1 1 1 1 ...
##  $ speed_diff            : num  10.33 -8.5 2.33 2.5 7.5 ...
##  $ playPassing_diff      : num  2.833 -0.833 -1 -5.333 0.833 ...
##  $ creationPassing_diff  : num  9.67 6 -4.5 -10.5 -1.33 ...
##  $ crossing_diff         : num  -1.17 1.67 5.33 4 -6 ...
##  $ shooting_diff         : num  2.08 -12.83 -3.33 -1.83 2.67 ...
##  $ defencePressure_diff  : num  0.917 -0.167 0.5 5.333 -6 ...
##  $ defenceAggression_diff: num  1.67 1 2 5 5 ...
##  $ defenceTeamWidth_diff : num  1.083 -7.667 0.833 -1.333 -2.667 ...
#c<- subset(c,results!=c('Draw'))

here i prepared the dataset for prediction model. for prediction model i am going to consider the differnce of all the features between team A and team B and also weather team is home or away.C table has information about the results of each team, team type and difference of all features with team api id. ## PREDICTION MODEL : DECISION TREE

library(caret)
# "seed" a random number generator 
set.seed(1234)

# Split the data into 70% training, 30% test
ind <- sample(2, nrow(c), replace=TRUE, prob=c(0.7, 0.3))
train_data <- c[ind==1,]
test_data <- c[ind==2,]
#classify match data 
formula <- results ~  team_type + speed_diff + playPassing_diff + creationPassing_diff + crossing_diff + shooting_diff + defencePressure_diff + defenceAggression_diff + defenceTeamWidth_diff
library(partykit)
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal
#decision tree for data
Ctree <- ctree(formula, data = train_data)
#prediction table
table(predict(Ctree, newdata = test_data), test_data$results)
##       
##        Draw Loss  Win
##   Draw    0    0    0
##   Loss 1899 3493 2053
##   Win  2036 2239 3675
plot(Ctree, gp = gpar(fontsize = 50),     # font size changed to 50
  inner_panel=node_inner,
  ip_args=list(
       abbreviate = TRUE, 
       id = FALSE))

#confusion matrix for dataset
confusionMatrix(predict(Ctree, newdata = test_data), test_data$results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Draw Loss  Win
##       Draw    0    0    0
##       Loss 1899 3493 2053
##       Win  2036 2239 3675
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4656          
##                  95% CI : (0.4577, 0.4735)
##     No Information Rate : 0.3723          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.1488          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Draw Class: Loss Class: Win
## Sensitivity               0.0000      0.6094     0.6416
## Specificity               1.0000      0.5910     0.5578
## Pos Pred Value               NaN      0.4692     0.4623
## Neg Pred Value            0.7444      0.7184     0.7242
## Prevalence                0.2556      0.3723     0.3721
## Detection Rate            0.0000      0.2269     0.2387
## Detection Prevalence      0.0000      0.4836     0.5164
## Balanced Accuracy         0.5000      0.6002     0.5997

summary: decision tree model’s accuracy rate is 46.56% for three classes loss, win and draw, but this is not predicting draw class accurately. it gives 7168 correct predictions/results for win and loss class out of 15395. it is very clear that it is very diificult to predict draw class and giving 2036 wrong results for draw class. deciion tree start node is weather team is home or away. ## PREDICTION MODEL : SVM model

Model_SVM <- svm(results ~  team_type + speed_diff + playPassing_diff + creationPassing_diff + crossing_diff + shooting_diff + defencePressure_diff + defenceAggression_diff + defenceTeamWidth_diff, data = train_data, kernel = "linear", cost = 0.1, gamma = 0.1)
summary(Model_SVM)
## 
## Call:
## svm(formula = results ~ team_type + speed_diff + playPassing_diff + 
##     creationPassing_diff + crossing_diff + shooting_diff + defencePressure_diff + 
##     defenceAggression_diff + defenceTeamWidth_diff, data = train_data, 
##     kernel = "linear", cost = 0.1, gamma = 0.1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
##       gamma:  0.1 
## 
## Number of Support Vectors:  31717
## 
##  ( 9085 11581 11051 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  Draw Loss Win
table(predict( Model_SVM, newdata= test_data), test_data$results) 
##       
##        Draw Loss  Win
##   Draw    0    0    0
##   Loss 1943 3476 2260
##   Win  1992 2256 3468
confusionMatrix(predict(Model_SVM, newdata = test_data), test_data$results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Draw Loss  Win
##       Draw    0    0    0
##       Loss 1943 3476 2260
##       Win  1992 2256 3468
## 
## Overall Statistics
##                                          
##                Accuracy : 0.4511         
##                  95% CI : (0.4432, 0.459)
##     No Information Rate : 0.3723         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.1256         
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Draw Class: Loss Class: Win
## Sensitivity               0.0000      0.6064     0.6054
## Specificity               1.0000      0.5650     0.5606
## Pos Pred Value               NaN      0.4527     0.4495
## Neg Pred Value            0.7444      0.7076     0.7057
## Prevalence                0.2556      0.3723     0.3721
## Detection Rate            0.0000      0.2258     0.2253
## Detection Prevalence      0.0000      0.4988     0.5012
## Balanced Accuracy         0.5000      0.5857     0.5830

Summary: SVM model’s accuracy rate is 45.11% which is lower than the decision tree model. there are 31717 support vectors for this model and it gives 6944 correct prediction results out of 15395. ##prediction model: random forest

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
#classify match data #Applying the formula for Random Forest Algorithm
RF <- results ~  team_type + speed_diff + playPassing_diff + creationPassing_diff + crossing_diff + shooting_diff + defencePressure_diff + defenceAggression_diff + defenceTeamWidth_diff
#random forest model for dataset 
RF_model <- randomForest(RF, data = train_data , ntree= 100, proximity = T)
print(RF_model)
## 
## Call:
##  randomForest(formula = RF, data = train_data, ntree = 100, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 56.31%
## Confusion matrix:
##      Draw Loss  Win class.error
## Draw 1808 3668 3609   0.8009906
## Loss 2683 6919 3785   0.4831553
## Win  2621 3828 6942   0.4815921
#To plot the Variable Importance Plot.
ImportancePlot <- varImpPlot(RF_model, main = "Importance of each variable")

RFTest <- predict(RF_model, newdata =test_data)
#prediction table
TestPredictionTable <- table(RFTest, test_data$results)
print(TestPredictionTable)
##       
## RFTest Draw Loss  Win
##   Draw  788 1092 1113
##   Loss 1597 3004 1681
##   Win  1550 1636 2934
#confusion matrix 
confusionMatrix(RFTest, test_data$results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Draw Loss  Win
##       Draw  788 1092 1113
##       Loss 1597 3004 1681
##       Win  1550 1636 2934
## 
## Overall Statistics
##                                          
##                Accuracy : 0.4369         
##                  95% CI : (0.429, 0.4448)
##     No Information Rate : 0.3723         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.1343         
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Draw Class: Loss Class: Win
## Sensitivity              0.20025      0.5241     0.5122
## Specificity              0.80759      0.6608     0.6704
## Pos Pred Value           0.26328      0.4782     0.4794
## Neg Pred Value           0.74625      0.7006     0.6988
## Prevalence               0.25560      0.3723     0.3721
## Detection Rate           0.05119      0.1951     0.1906
## Detection Prevalence     0.19441      0.4081     0.3975
## Balanced Accuracy        0.50392      0.5924     0.5913

Summary: Random forest model gives 43.69% accuracy rate and error rate is 56.31%. it gives highest error rate(approx. 80%) for draw class as it is unable to predict correctly.error rate for win and loss is 48.15% and 48.31% respectively.this model is giving total of 6726 correct results out of 15395. Conclusion: out of above three prediction models decision tree model gives higest accuracy rate but decision tree and SVM completely giving wrong results for draw class however, random forest is only model which is predicting all of three classes but giving 43.7% accuracy rate. so, random forest can be considered as good prediction model when we considering all of three classes and decision tree gives more accuracy when its two classes win and loss.